## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'plotly'
##
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
##
## The following object is masked from 'package:stats':
##
## filter
##
##
## The following object is masked from 'package:graphics':
##
## layout
Imported demographic, HIV test, and sexual behavior dataframes 1999 - 2016.
demo_list =
list.files(path = "NHANES/DEMO", full.names = TRUE)
hiv_list =
list.files(path = "NHANES/HIV", full.names = TRUE)
sxq_list =
list.files(path = "NHANES/SXQ", full.names = TRUE)
load_files = function(x){
filetype =
str_extract(x, "DEMO|HIV|SXQ")
year =
str_extract(x, "(?<=_)[A-Z]")
data =
read_xpt(x) |>
janitor::clean_names()|>
mutate(filetype = filetype, year = year)
}
demo_list_output =
map(demo_list, load_files)
hiv_list_output =
map(hiv_list, load_files)
sxq_list_output =
map(sxq_list, load_files)
join_pair = function(df1, df2) {
left_join(df1, df2, by = c("seqn", "year"))
}
hiv_demo_list =
map2(hiv_list_output, demo_list_output, ~join_pair(.x, .y))
hiv_demo_sxq_df =
map2(hiv_demo_list, sxq_list_output, ~join_pair(.x, .y)) |>
bind_rows()
Re-coded variables of interest: Restricted particpants’s age between 20 and 49
cleaned_joined_df =
hiv_demo_sxq_df |>
filter(ridageyr >= 20 & ridageyr <= 49) |>
mutate(hiv = case_when(
lbdhi == 1 | lbxhivc == 1 ~ 1,
lbdhi == 2 | lbxhivc == 2 ~ 2)
)|>
mutate(MSM = case_when(
year %in% c("A","B","C") ~ sxq220,
year %in% c("D","E","F","G","H","I") ~ sxq550)
)|>
mutate(WSW = case_when(
year %in% c("A","B","C") ~ sxq150,
year %in% c("D","E","F","G","H","I") ~ sxq490)
)|>
mutate(year = recode(year,
"A" = "1999-2000",
"B" = "2001-2002",
"C" = "2003-2004",
"D" = "2005-2006",
"E" = "2007-2008",
"F" = "2009-2010",
"G" = "2011-2012",
"H" = "2013-2014",
"I" = "2015-2016"),
hiv = recode(hiv,
"1" = "Reactive",
"2" = "Non-reactive"),
gender = recode(riagendr,
"1" = "Male",
"2" = "Female"),
age = ridageyr,
race = recode(ridreth1,
"1" = "Mexican American",
"2" = "Other Hispanic",
"3" = "Non-Hispanic White",
"4" = "Non-Hispanic Black",
"5" = "Other Race - Including Multi-Racial"),
education = recode(dmdeduc2,
"1" = "Less than 9th grade",
"2" = "9-11th grade",
"3" = "High school graduate or equivalent",
"4" = "Some college or AA degree",
"5" = "College graduate or above",
"7" = NA_character_,
"9" = NA_character_),
marriage = recode(dmdmartl,
"1" = "Married",
"2" = "Widowed",
"3" = "Divorced",
"4" = "Separated",
"5" = "Never married",
"6" = "Living with partner",
"77" = NA_character_,
"99" = NA_character_)
) |>
mutate(samesexcontact = case_when(
(MSM %in% c(0, 7777, 77777,9999, 99999) | WSW %in% c(0, 7777, 77777, 9999, 99999)) ~ 0,
(MSM >= 1 & MSM <= 600) | (WSW >= 1 & WSW <= 600) ~ 1
)) |>
select(seqn, hiv, age, gender, race, education, marriage, year, samesexcontact)
cleaned_joined_df |>
group_by(samesexcontact, hiv) |>
summarize(n = n())
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## # A tibble: 9 × 3
## # Groups: samesexcontact [3]
## samesexcontact hiv n
## <dbl> <chr> <int>
## 1 0 Non-reactive 6826
## 2 0 Reactive 33
## 3 0 <NA> 137
## 4 1 Non-reactive 572
## 5 1 Reactive 48
## 6 1 <NA> 15
## 7 NA Non-reactive 15589
## 8 NA Reactive 43
## 9 NA <NA> 1136
Visualization - 1 Total number as y-axis value Across race
cleaned_joined_df %>%
filter(hiv == "Reactive") %>%
group_by(race, gender) %>%
summarize(n = n(), age_mean = round(mean(age), 2)) %>%
ungroup() %>%
mutate(race = fct_reorder(race, n)) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~race, y = ~n, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Race"),
yaxis = list(title = "Total Number of HIV Positive Patients"),
title = "The Total Number of HIV Positive Patients from 1999 to 2016 Across Races"
)
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
Visualization - 1-1 Total percent as y-axis value
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
) %>%
group_by(race, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(race = fct_reorder(race, hiv_percent)) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~race, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Race"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Races"
)
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
Visualization 2: by year
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>%
group_by(year, gender) %>%
summarize(age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~year, y = ~hiv_percent, color = ~gender, type = "scatter", mode = "lines+markers",
text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Year"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016"
)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
Visualization 3: Across Education level
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
) %>%
group_by(education, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(education = forcats::fct_relevel(education, c("Less than 9th grade", "9-11th grade",
"High school graduate or equivalent",
"Some college or AA degree",
"College graduate or above"))) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~education, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Education level"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Education Level"
)
## `summarise()` has grouped output by 'education'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations
Visualization 4: Across marital status
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>%
group_by(marriage, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(marriage = fct_reorder(marriage, hiv_percent)) %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~marriage, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Marital Status"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Marital Status"
)
## `summarise()` has grouped output by 'marriage'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations
Visualization 5: Sexual behavior
cleaned_joined_df %>%
mutate(hiv = ifelse(is.na(hiv), "NA", hiv),
samesexcontact = recode(samesexcontact, "0" = "No",
"1" = "Yes")) %>%
group_by(samesexcontact, gender) %>%
summarize(total_par = n(), age_mean = round(mean(age), 2), hiv_num = sum(hiv == "Reactive"),
hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
ungroup() %>%
mutate(text_label = str_c("mean age: ", age_mean)) %>%
plot_ly(x = ~samesexcontact, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>%
layout(
xaxis = list(title = "Same Sex Sexual Behavior"),
yaxis = list(title = "Total Percent of HIV Positive Patients"),
title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Sexual Behavior"
)
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## Warning: Ignoring 2 observations